/*------------------------------------------------------------------------------

*** PURPOSE: 	This do-file replicates the analysis for the IMMANA survey data  
				to reproduce the figures and tables in the paper 
				"Respondent biases in household surveys" by Andrew Dillon and 
				Edouard Romeo Mensah. 
				
				1. It uses the dataset titled "surveydata.dta", contained in the 
				   folder titled "01_data" within the replication package.
				2. All figures and tables created in this do-file are saved in the
				   folder titled "04_output" within the replication package.
				3. A log of the code is stored in the folder titled "03_log".
				

*** OUTLINE: 	Section A. DESCRIPTIVE STATISTICS
					1. Covariates
							Household head's characteristics  
							Housing and HH asset characteristics
					2. Outcomes
							Land characteristics
							Crop portfolio
							Production inputs
							Labor characteristics
							Output and yield
							Value of output and productivity
							Commercialization
					
				Section B. BALANCING TESTS
					1. Appendix Table 1: Balancing tests on hh head’s demographics,
								housing, and asset characteristics  
					2. Appendix Table 2: Between treated balancing tests
								on respondent characteristics
					
				Section C. TIME OF COMPLETION OF QUESTIONNAIRES
					1. Figure 2: Number of daily visits to complete the whole questionnaire
					2. Figure 3: Time (minutes) for completing the agriculture questionnaire module
					
				Section D. TREATMENT EFFECTS
					1. Table 1: Land characteristics
					2. Table 2, Appendix Table 3: Crop portfolio
					3. Table 3: Production inputs
					4. Table 4, Appendix Table 4: Labor characteristics
					5. Table 5: Output and yield
					6. Appendix Table 5: Commercialization
					7. Appendix Table 6: Value of output and productivity
							
				Section E. GENDER DISAGGREGATED EFFECTS
				Table 6: Gender effects on agricultural outcomes
				
------------------------------------------------------------------------------*/


* Initialise the session: empty memory, no open log, no leftover macros.

	cls
	clear *
	clear matrix
	cap log close
	set more off
	macro drop _all
	version 17
	* set maxvar 30000


* Set directory and log location. 

/*	
	Here, insert the directory for the folder "Proxy Ag -- Replication Package",
	which contains the folders "01_data", "02_do", "03_log", and "04_output". 
	
	The log path is relative to that folder. It is written with a forward 
	slash because Stata accepts "/" as a path separator on Windows, macOS, and 
	Linux, whereas "\" only works on Windows.
*/ 

	cd "" 
	
	* Plain-text log of this run; any existing log with this name is overwritten.
	log using "03_log/main.txt", text replace

	

* Create working sample.
* The working sample keeps only households with a non-missing HHplotsize.

	* Forward slash so the relative path resolves on Windows, macOS, and Linux.
	use "01_data/surveydata.dta", clear
	misstable sum HHplotsize
	
	// 15 HHs will be dropped. 

	* Sample size by province, before and after the restriction.
	ta province treatarm, m
	ta province treatarm if HHplotsize!=., m
	
	* Sample size by village, before and after the restriction.
	ta village treatarm, m
	ta village treatarm if HHplotsize!=., m
	
	keep if HHplotsize!=.
	
	
* Build one 0/1 indicator per study arm from treatarm.

	describe treat*
	tabulate treatarm treatcomp, missing

	* treatarm == 0: control.
	generate control = (treatarm == 0)
	label variable control "Control"

	* treatarm == 1: household-head treatment.
	generate treat1_hhead = (treatarm == 1)
	label variable treat1_hhead "HH head treatment"

	* treatarm == 2: random-proxy treatment.
	generate treat2_rproxy = (treatarm == 2)
	label variable treat2_rproxy "Random proxy treatment"

	* Move identifiers, arm indicators, and timing variables to the front of the dataset.
	order uhhid-treatcomp control treat1_hhead treat2_rproxy *time times_5 times_10, first
	
	
	
********************************************************************************
*						  A. DESCRIPTIVE STATISTICS 						   *
********************************************************************************

	* Run the descriptive-statistics do-file quietly.
	* Forward slashes keep the relative path valid on Windows, macOS, and Linux.
	qui do "02_do/01_main tables/descriptivestats"
	
	* Tabulate average time of completion of questionnaires.
	* First inspect every variable whose name contains "time".
	descr *time*
	codebook *time*

	* Generate variable with length of the whole questionnaire in days.
	* Daily dates are small numbers, so the default float storage holds them exactly.
	gen startd_full		= dofc(starttime)
	gen endd_full		= dofc(endtime)
	gen time_full_days	= endd_full-startd_full
	ta time_full_days treatarm, m

	* Generate variable with length of the whole questionnaire in minutes.
	* Stored as double: minutes elapsed since 1960 exceed 2^24 (the largest
	* integer a float represents exactly) for any date after the early 1990s,
	* so float storage would round each timestamp and distort the difference.
	* NOTE(review): dofc() above treats starttime/endtime as %tc, while cofC()
	* here treats them as %tC -- confirm which format the variables carry.
	gen double startm_full 	= minutes(cofC(starttime))
	gen double endm_full 	= minutes(cofC(endtime))
	gen double time_full_ms = endm_full - startm_full
	* ta time_full_ms treatarm, m
	
	list starttime endtime time_full_days time_full_ms ///
		treatarm times_5 times_10 if time_full_ms <= 60
		// Authors' note: 21 (1 control & 20 treatment) questionnaires filled out in less than one hour.
		// (Count recorded with float-stored minutes; re-check after the double-precision fix.)
	
	sum time_full_days time_full_ms
	sum time_full_ms if time_full_ms <= 240
	sum time_full_days time_full_ms if time_full_ms <= 240 & treatarm > 0
		// Authors' note: 426 questionnaires filled out in less than four hours,
		// 330 of which are treatment questionnaires.
	
	* Flag questionnaires that took more than 24 hours (1440 minutes).
	* Stata treats missing as larger than any number, so without the
	* !missing() guard a questionnaire with an unknown duration would be
	* flagged as multi-day; it is left missing instead.
	gen qmultiday		= (time_full_ms > 1440) if !missing(time_full_ms)
	ta time_full_days qmultiday
	* A questionnaire finished within 24 hours counts as zero extra days, even if it crossed midnight.
	replace time_full_days = 0 if qmultiday == 0
	la var qmultiday "Whole questionnaire completed in more than 1 day (1440 min.)"    
	la var time_full_days "Number of days spent in the field for completing the whole questionnaire" 
	
	* Minutes are only kept for questionnaires completed within a day.
	replace time_full_ms = . if qmultiday == 1
	la var time_full_ms "Time (minutes) for completing whole quest. filled in a day" 
	
	gen nvisits_full = time_full_days + 1
	la var nvisits_full "Number of daily visits to the household to complete the whole questionnaire" 

	* Generate descriptive statistics for the agriculture module -- what matters for this survey experiment (by design).
	* Check for empty timestamp strings (foreach replaces the obsolete -for- command).
	foreach v of varlist times_5 times_10 {
		tab1 `v' treatarm if `v' == ""
	}
	
	* Parse the string timestamps (two-digit year, century 20) and convert to minutes.
	* Stored as double: minutes since 1960 for 21st-century dates exceed 2^24,
	* the largest integer a float holds exactly, so float storage would round
	* both endpoints before they are differenced.
	gen double startm_ag 	= minutes(clock(times_5, "20YMD hms"))
	gen double endm_ag 		= minutes(clock(times_10, "20YMD hms"))
	gen double time_ag_ms 	= endm_ag - startm_ag
	ta time_ag_ms treatarm, m
	
	list times_5 times_10 time_ag_ms treatarm HHplotsize plotcount if time_ag_ms < 0
		// These are questionnaires filled out the same day but ag module questions not answered in order. 
	* Use the absolute gap so out-of-order timestamps still give a positive duration.
	replace time_ag_ms	= abs(time_ag_ms)	
	ta time_ag_ms treatarm, m	
	la var time_ag_ms "Time (minutes) for completing agriculture module"

	
********************************************************************************
*							B. BALANCING TESTS 								   *
********************************************************************************
		
	* Appendix Table 1. Balancing tests on HH head's demographics, housing, and 
	* asset characteristics
	* (Forward slashes keep the relative paths valid on Windows, macOS, and Linux.)
		
	qui do "02_do/02_appendix tables/table1-balancingtables-covariates"
		
	* Appendix Table 2. Between treated balancing tests on respondent characteristics

	qui do "02_do/02_appendix tables/table2-balancingtables-respondent"


	
********************************************************************************
*					C. TIME OF COMPLETION OF QUESTIONNAIRES 				   *
********************************************************************************
	
	* Figure 2. Number of daily visits to complete the whole questionnaire
	* (Forward slashes keep the relative paths valid on Windows, macOS, and Linux.)
	* NOTE(review): the do-files are named figure1-/figure2- while the paper
	* numbering here is Figure 2/3 -- confirm the file names in "02_do".

	qui do "02_do/01_main tables/figure1-dailyvisits"
	
	* Figure 3. Time (minutes) for completing the agriculture module

	qui do "02_do/01_main tables/figure2-timeagmodule"

	
	
********************************************************************************
*							D. TREATMENT EFFECTS							   *
********************************************************************************
	
	* Define globals.
	* Each global holds a list of outcome variable names. The statements between
	* "#d ;" and "#d cr" end with ";" so that the quoted lists can span lines.
	* NOTE(review): the globals are presumably consumed by the table do-files
	* run further down; those files are not visible here.
	#d ;
	/* depvars1: land-use, land-size, acquisition-mode, and soil-type variables. */
	global  depvars1 "hanforest hapasture hacrops hagarden 		
			HHlandsize HHplotsize 
			ma_inherited ma_giftdon st_clayey st_sandy " ;

			
	/* depvars2: crop-group dummies and number of crops. */
	global  depvars2 "d_cereals d_legumes 
		    d_vegetables d_cashcrops ncrops" ;
			
	
	/* depvars3: asset dummies plus fertilizer and PIH use/quantity/value variables. */
	global 	depvars3 "dasset40 dasset42 dasset64 dasset46 dasset39 dasset62
			dasset63 useferti logfertikgha logfertivalha 
			usePIH logPIHliterha logPIHvalha" ;
			

	/* condlvars: name stems of the labor variables that exist with a "condl_" prefix. */
	global  condlvars "pdayYR_unpaidhhm pdayYR_unpaidhhf 
			pdayYR_unpaidhhc pdayYR_paidnonhhm pdayYR_paidnonhhf
			pdayYR_paidnonhhc pdayYR_unpaidrel pdayYR_unpaidnonrel
			pdayYR_unpaidhhtot adeqYR_unpaidhhtot pdayYR_paidnonhhtot
			adeqYR_paidnonhhtot pdayYR_unpaidnonhhtot pdayYR_childtot
			pdayYR_labortot usdYR_paidnonhhm usdYR_paidnonhhf
			usdYR_paidnonhhc usdYR_labortot w_pdayYR_unpaidhhm
			w_pdayYR_unpaidhhf w_pdayYR_unpaidhhc w_pdayYR_paidnonhhm
			w_pdayYR_paidnonhhf w_pdayYR_paidnonhhc w_pdayYR_unpaidrel
			w_pdayYR_unpaidnonrel w_pdayYR_unpaidhhtot w_adeqYR_unpaidhhtot
			w_pdayYR_paidnonhhtot w_adeqYR_paidnonhhtot w_pdayYR_unpdnonhhtot
			w_pdayYR_childtot w_pdayYR_labortot w_usdYR_paidnonhhm
			w_usdYR_paidnonhhf w_usdYR_paidnonhhc w_usdYR_labortot" ;
	
	/* Shorten the prefix: condl_<stem> becomes c_<stem> for every stem above. */
	foreach x in $condlvars {;
		ren condl_`x' c_`x' ;
	} ;
	
	/* l_<var> = log(1 + <var>) for the unpaid-labor day counts. */
	foreach x in pdayYR_unpaidhhm pdayYR_unpaidhhf pdayYR_unpaidhhc pdayYR_unpaidhhtot
			pdayYR_unpaidrel pdayYR_unpaidnonrel pdayYR_unpaidnonhhtot { ;
		gen l_`x'=log(1+`x') ;
	} ;
	
	/* Same log(1 + x) transform for the renamed c_ paid-labor variables and the labor totals. */
	foreach x in c_pdayYR_paidnonhhm c_pdayYR_paidnonhhf c_pdayYR_paidnonhhc c_pdayYR_paidnonhhtot 
			c_usdYR_paidnonhhm c_usdYR_paidnonhhf c_usdYR_paidnonhhc c_usdYR_labortot
			pdayYR_childtot pdayYR_labortot usdYR_labortot { ;
		gen l_`x'=log(1+`x') ;
	} ;
	
	/* l_<var>ha = log(1 + <var>/HHplotsize): labor totals per unit of plot size
	   (presumably hectares, given the "ha" suffix -- confirm). */
	foreach x in pdayYR_labortot usdYR_labortot c_usdYR_labortot { ;
		gen l_`x'ha=log(1+(`x'/HHplotsize)) ;
	} ;
	
	/* depvars4: log labor totals and the paid-labor dummy. */
	global  depvars4 "l_pdayYR_unpaidhhtot l_pdayYR_unpaidnonhhtot
			l_pdayYR_labortot usepaidlabor l_usdYR_labortot" ;


	/* depvars5: log output and log yield by crop group. */
	global  depvars5 "log_output_cereals log_output_legumes
			log_output_vegetables log_output_foodcrops
			log_yield_cereals log_yield_legumes 
			log_yield_vegetables log_yield_foodcrops" ;
	
	
	/* depvars6: mixed list of land, input, crop, and output-value variables.
	   The log_usdprod_* names only exist after the rename loop further down;
	   that is fine here because a global stores text, not variables. */
	global  depvars6 "haedge HHlandsize HHplotsize ma_inherited ma_giftdon ma_other 
			useferti logfertikgha logfertivalha usepaidlabor
			d_Rice d_Sorghum d_cereals d_Bean
			d_Sesame d_legumes d_Okra
			d_vegetables d_cashcrops ncrops
			log_usdprod_cereals log_usdprod_legumes 
			log_usdprod_vegetables log_usdprod_foodcrops 
			log_usdprod_cashcrops log_usdprod_total " ;

	
	/* depvars3_app: expands the globals cereals, legumes, and vegetables2, which are
	   not defined in this file. NOTE(review): presumably set by a do-file run
	   earlier (all macros were dropped at the top of this file) -- verify. */
	global  depvars3_app "$cereals $legumes $vegetables2 d_vegetables 
			d_Cotton d_cashcrops ncrops" ;
		
		
	/* depvars4_app: the full set of log labor variables, including per-plot-size versions. */
	global  depvars4_app "l_pdayYR_unpaidhhm l_pdayYR_unpaidhhf l_pdayYR_unpaidhhc l_pdayYR_unpaidhhtot
			l_pdayYR_unpaidrel l_pdayYR_unpaidnonrel l_pdayYR_unpaidnonhhtot
			usepaidlabor l_pdayYR_childtot l_pdayYR_labortot l_usdYR_labortot 
			l_pdayYR_labortotha l_usdYR_labortotha l_c_usdYR_labortotha" ;
		
		
	/* depvars5_app: commindex_* variables by crop and crop group. */
	global  depvars5_app "commindex_Rice commindex_Millet commindex_Sorghum commindex_Corn
			commindex_cereals commindex_Bean commindex_Peanut commindex_Bambaranut 
			commindex_Sesame commindex_legumes commindex_Tomato commindex_Okra
			commindex_Sorrel commindex_vegetables commindex_foodcrops commindex_Cotton " ;
			
			
	/* Rename log_usd_voutput_<group> to log_usdprod_<group>. */
	foreach x in cereals legumes vegetables foodcrops cashcrops total { ;
		ren log_usd_voutput_`x' log_usdprod_`x' ;
	} ;
	
	/* depvars6_app: log_usdprod_* and log_usd_vyield_* variables by crop group. */
	global 	depvars6_app "log_usdprod_cereals log_usdprod_legumes 
			log_usdprod_vegetables log_usdprod_foodcrops 
			log_usdprod_cashcrops log_usdprod_total 
			log_usd_vyield_cereals log_usd_vyield_legumes 
			log_usd_vyield_vegetables log_usd_vyield_foodcrops 
			log_usd_vyield_cashcrops log_usd_vyield_total" ;
	
	
	/* depvars: concatenation of the lists named below. depvars6, depvars3_app,
	   and depvars4_app are not part of it. */
	global depvars "$depvars1 $depvars2 $depvars3 $depvars4 
					$depvars5 $depvars5_app $depvars6_app";
	
	#d cr
			
	* headmale is headgender with its 0/1 coding swapped; value 1 is labelled "male".
	recode headgender (0=1) (1=0), generate(headmale)
	label define femalemale 0 "female" 1 "male"
	label values headmale femalemale
	label variable headmale "Male-headed HH"
	

	
	* Each table is produced by its own do-file, run quietly.
	* Paths use forward slashes so they resolve on Windows, macOS, and Linux.

	* 1. LAND CHARACTERISTICS

	qui do "02_do/01_main tables/table1-land"	


	* 2. CROP PORTFOLIO

	qui do "02_do/01_main tables/table2-crops_main"
	qui do "02_do/02_appendix tables/table3-crops_app"

	
	* 3. PRODUCTION INPUTS 
			
	qui do "02_do/01_main tables/table3-productioninputs"
				
				
	* 4. LABOR

	qui do "02_do/01_main tables/table4-labor_main"
	qui do "02_do/02_appendix tables/table4-labor_app"

	
	* 5. OUTPUT AND YIELD

	qui do "02_do/01_main tables/table5-outputandyield"
 				
	
	* 6. COMMERCIALIZATION

	qui do "02_do/02_appendix tables/table5-commercialization"
	
	
	* 7. VALUE OF OUTPUT AND PRODUCTIVITY

	qui do "02_do/02_appendix tables/table6-outputandproductivity"
			


********************************************************************************
*			E. GENDER DISAGGREGATED EFFECTS OF RANDOM PROXY METHOD			   *
********************************************************************************
	
	* Table 6: gender effects on agricultural outcomes.
	* Forward slashes keep the relative path valid on Windows, macOS, and Linux.
	qui do "02_do/01_main tables/table6-gendereffectsproxyhh"

				

/*----------------------------------------------------------------------------*/
			    